library(tidyverse)
library(data.table)
library(gdata)
library(CLIPanalyze)
library(BSgenome)
library(VennDiagram)
library(ggplot2)
library(biomaRt)
library(RColorBrewer)
library(plotly)
dir.create("PDF_figure/CDF_zscore_merged", showWarnings = FALSE)
DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/Differential Analysis.csv")
## New names:
## * `` -> ...1
## Rows: 10962 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ...1
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(DGE)[1] <- "EnsemblID"
DGE_pro <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/ceMS_diff.csv")
## New names:
## * `` -> ...1
## * ...2 -> ...3
## Rows: 8161 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Protein Id, ...3, Gene Symbol
## dbl (5): ...1, p_values, q_values, foldChange, LFC
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(DGE_pro)[2] <- "UniprotID"
mir.targets <- readRDS("Datafiles/miRNA-merged-peaks-list-12232019-withIDs.rds")
Compute GSEA z-scores using the PAGE method
zscore.cal <- function(genes = NA, index = NA, expression.dataset){
if (sum(!is.na(genes)) > 0 && is.na(index)){
gene.set <- expression.dataset[expression.dataset$EnsemblID %in% genes,]
} else if (!is.na(index)) {
gene.set <- expression.dataset[index,]
} else {
print("Please input gene names or row index.")
}
mu <- mean(expression.dataset$log2FoldChange)
Sm <- mean(gene.set$log2FoldChange)
m <- length(genes)
SD <- sd(expression.dataset$log2FoldChange)
z <- (Sm-mu)*sqrt(m)/SD
return(z)
}
zscores.all <- as.data.frame(sapply(mir.targets,
function(targets){
zscore.cal(genes = targets$target_Ensembl_ID, expression.dataset = DGE)
}))
colnames(zscores.all) <- c("z.score")
lens <- as.data.frame(sapply(mir.targets, function(x) length(x)))
colnames(lens) <- "N"
zscores.all <- cbind(zscores.all, lens$N)
mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-12232019.rds")
zscores.all <- as_tibble(zscores.all, rownames = "miR.family")
zscores.all <- inner_join(zscores.all, mirna.family.DGE[,c(1,7)], by = c("miR.family" = "miR.family"))
colnames(zscores.all)[3] <- "N"
p_rna <- ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N, label = miR.family)) +
geom_point(colour = "#EC469A", alpha = 0.6) +
xlab("RNA-Seq Z-score") +
ylab("miRNA family abundance") +
theme_bw() +
theme(panel.border = element_rect(),
panel.background = element_blank(),
panel.grid.major = element_line(),
panel.grid.minor = element_line(),
axis.title.x = element_text(size=14, margin = margin(t = 10)),
axis.title.y = element_text(size=14, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_line(size = 0.5),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
ggplotly(p_rna)
pdf("PDF_figure/CDF_zscore_merged/RNA_CDF_zscore.pdf",
height = 4,
width = 6)
ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N)) +
geom_point(colour = "#EC469A", alpha = 0.6) +
xlab("RNA-Seq Z-score") +
ylab("miRNA family abundance") +
theme_bw() +
theme(panel.border = element_rect(),
panel.background = element_blank(),
panel.grid.major = element_line(),
panel.grid.minor = element_line(),
axis.title.x = element_text(size=14, margin = margin(t = 10)),
axis.title.y = element_text(size=14, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_line(size = 0.5),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
dev.off()
## quartz_off_screen
## 2
cor.test(zscores.all$z.score, log2(zscores.all$baseMean))
##
## Pearson's product-moment correlation
##
## data: zscores.all$z.score and log2(zscores.all$baseMean)
## t = 4.4332, df = 73, p-value = 3.213e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2608688 0.6224583
## sample estimates:
## cor
## 0.4605621
Compute GSEA z-scores using the PAGE method
zscore.cal <- function(genes = NA, index = NA, expression.dataset){
if (sum(!is.na(genes)) > 0 && is.na(index)){
gene.set <- expression.dataset[expression.dataset$UniprotID %in% genes,]
} else if (!is.na(index)) {
gene.set <- expression.dataset[index,]
} else {
print("Please input gene names or row index.")
}
mu <- mean(expression.dataset$LFC)
Sm <- mean(gene.set$LFC)
m <- length(genes)
SD <- sd(expression.dataset$LFC)
z <- (Sm-mu)*sqrt(m)/SD
return(z)
}
zscores.all <- as.data.frame(sapply(mir.targets,
function(targets){
zscore.cal(genes = targets$target_Uniprot_ID, expression.dataset = DGE_pro)
}))
colnames(zscores.all) <- c("z.score")
lens <- as.data.frame(sapply(mir.targets, function(x) length(x)))
colnames(lens) <- "N"
zscores.all <- cbind(zscores.all, lens$N)
zscores.all <- as_tibble(zscores.all, rownames = "miR.family")
zscores.all <- inner_join(zscores.all, mirna.family.DGE[,c(1,7)], by = c("miR.family" = "miR.family"))
colnames(zscores.all)[3] <- "N"
p_prortein <- ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N, label = miR.family)) +
geom_point(colour = "#1C75BB", alpha = 0.6) +
xlab("Proteomics Z-score") +
ylab("miRNA family abundance") +
theme_bw() +
theme(panel.border = element_rect(),
panel.background = element_blank(),
panel.grid.major = element_line(),
panel.grid.minor = element_line(),
axis.title.x = element_text(size=14, margin = margin(t = 10)),
axis.title.y = element_text(size=14, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_line(size = 0.5),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
ggplotly(p_prortein)
pdf("PDF_figure/CDF_zscore_merged/Protein_CDF_zscore.pdf",
height = 4,
width = 6)
ggplot(zscores.all, aes(x = z.score, y = log2(baseMean), size = N)) +
geom_point(colour = "#1C75BB", alpha = 0.6) +
xlab("Proteomics Z-score") +
ylab("miRNA family abundance") +
theme_bw() +
theme(panel.border = element_rect(),
panel.background = element_blank(),
panel.grid.major = element_line(),
panel.grid.minor = element_line(),
axis.title.x = element_text(size=14, margin = margin(t = 10)),
axis.title.y = element_text(size=14, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_line(size = 0.5),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
dev.off()
## quartz_off_screen
## 2
cor.test(zscores.all$z.score, log2(zscores.all$baseMean))
##
## Pearson's product-moment correlation
##
## data: zscores.all$z.score and log2(zscores.all$baseMean)
## t = 4.562, df = 73, p-value = 2.001e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2732600 0.6305638
## sample estimates:
## cor
## 0.47101
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] plotly_4.10.0 RColorBrewer_1.1-2
## [3] biomaRt_2.48.3 VennDiagram_1.7.0
## [5] futile.logger_1.4.3 BSgenome_1.60.0
## [7] rtracklayer_1.52.1 CLIPanalyze_0.0.10
## [9] DESeq2_1.32.0 GenomicAlignments_1.28.0
## [11] Rsamtools_2.8.0 Biostrings_2.60.2
## [13] XVector_0.32.0 SummarizedExperiment_1.22.0
## [15] MatrixGenerics_1.4.3 matrixStats_0.61.0
## [17] GenomicFeatures_1.44.2 AnnotationDbi_1.54.1
## [19] Biobase_2.52.0 GenomicRanges_1.44.0
## [21] GenomeInfoDb_1.28.4 IRanges_2.26.0
## [23] S4Vectors_0.30.2 BiocGenerics_0.38.0
## [25] plyr_1.8.6 gdata_2.18.0
## [27] data.table_1.14.2 forcats_0.5.1
## [29] stringr_1.4.0 dplyr_1.0.7
## [31] purrr_0.3.4 readr_2.1.0
## [33] tidyr_1.1.4 tibble_3.1.6
## [35] ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 rjson_0.2.20 ellipsis_0.3.2
## [4] fs_1.5.0 rstudioapi_0.13 farver_2.1.0
## [7] bit64_4.0.5 fansi_0.5.0 lubridate_1.8.0
## [10] xml2_1.3.2 splines_4.1.1 cachem_1.0.6
## [13] geneplotter_1.70.0 knitr_1.36 jsonlite_1.7.2
## [16] broom_0.7.10 annotate_1.70.0 dbplyr_2.1.1
## [19] png_0.1-7 compiler_4.1.1 httr_1.4.2
## [22] biosignals_0.1.0 backports_1.4.0 lazyeval_0.2.2
## [25] assertthat_0.2.1 Matrix_1.3-4 fastmap_1.1.0
## [28] cli_3.1.0 formatR_1.11 htmltools_0.5.2
## [31] prettyunits_1.1.1 tools_4.1.1 gtable_0.3.0
## [34] glue_1.5.0 GenomeInfoDbData_1.2.6 rappdirs_0.3.3
## [37] Rcpp_1.0.7 cellranger_1.1.0 jquerylib_0.1.4
## [40] vctrs_0.3.8 crosstalk_1.2.0 xfun_0.28
## [43] rvest_1.0.2 lifecycle_1.0.1 restfulr_0.0.13
## [46] gtools_3.9.2 XML_3.99-0.8 zlibbioc_1.38.0
## [49] scales_1.1.1 vroom_1.5.6 hms_1.1.1
## [52] lambda.r_1.2.4 yaml_2.2.1 curl_4.3.2
## [55] memoise_2.0.1 stringi_1.7.5 RSQLite_2.2.8
## [58] genefilter_1.74.1 BiocIO_1.2.0 filelock_1.0.2
## [61] BiocParallel_1.26.2 rlang_0.4.12 pkgconfig_2.0.3
## [64] bitops_1.0-7 evaluate_0.14 lattice_0.20-45
## [67] labeling_0.4.2 htmlwidgets_1.5.4 bit_4.0.4
## [70] tidyselect_1.1.1 magrittr_2.0.1 R6_2.5.1
## [73] generics_0.1.1 DelayedArray_0.18.0 DBI_1.1.1
## [76] pillar_1.6.4 haven_2.4.3 withr_2.4.2
## [79] survival_3.2-13 KEGGREST_1.32.0 RCurl_1.98-1.5
## [82] modelr_0.1.8 crayon_1.4.2 futile.options_1.0.1
## [85] utf8_1.2.2 BiocFileCache_2.0.0 tzdb_0.2.0
## [88] rmarkdown_2.11 progress_1.2.2 locfit_1.5-9.4
## [91] readxl_1.3.1 blob_1.2.2 reprex_2.0.1
## [94] digest_0.6.28 xtable_1.8-4 munsell_0.5.0
## [97] viridisLite_0.4.0